In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
TO DO:
In [2]:
%matplotlib inline
In [3]:
from bigbang.archive import Archive

# Open the "scipy-user" archive, looking for it under ../archives.
arx = Archive(
    "scipy-user",
    archive_dir="../archives",
)
Get the activity of the mailing list (one time series per participant).
In [4]:
act = arx.get_activity()
Since we are going to compute correlations between N different time series, which is an $O(N^2)$ operation, let's limit N.
In [5]:
cutoff = 20


def filtered_participants(cutoff):
    """Return the columns of ``act`` whose column sum is strictly greater than ``cutoff``.

    Reads the notebook-level ``act`` table; nothing is modified.
    """
    column_totals = act.sum()
    return act.columns[column_totals > cutoff]


# Preview the first ten selected columns.
filtered_participants(cutoff)[:10]
Out[5]:
In [6]:
# pearsonr is part of the public scipy.stats namespace; the scipy.stats.stats
# module path is a deprecated private location.
from scipy.stats import pearsonr

fc = filtered_participants(cutoff)
n = len(fc)

# Pearson correlation assumes normally distributed data, and we believe
# activity to be log-normally distributed (see Shalizi...), so we correlate
# log(1 + activity). Each series is transformed once here instead of being
# recomputed for every pair inside the O(n^2) loop below.
log_activity = [np.log1p(act[name]) for name in fc]

# pc[i, j] holds the Pearson correlation between participants fc[i] and fc[j].
pc = np.zeros([n, n])
for i in range(n):
    for j in range(i, n):
        # The matrix is symmetric: compute the upper triangle and mirror it.
        c = pearsonr(log_activity[i], log_activity[j])[0]
        pc[i, j] = c
        pc[j, i] = c
pc
Out[6]:
In [7]:
# Build a graph from the correlation matrix, then swap the integer node ids
# (row/column positions in pc) for the corresponding entries of fc.
labels = {position: name for position, name in enumerate(fc)}
G = nx.relabel_nodes(nx.Graph(pc), labels)
In [8]:
plt.imshow(pc)
Out[8]:
In [9]:
# from http://sociograph.blogspot.com/2012/11/visualizing-adjacency-matrices-in-python.html
import networkx as nx
from matplotlib import pyplot, patches
def draw_adjacency_matrix(G, node_order=None, partitions=None, colors=None, cmap="Greys"):
    """
    Draw the adjacency matrix of G as an image in a new 5x5 inch figure,
    optionally outlining blocks of nodes along the diagonal.

    - G is a networkx graph
    - node_order (optional) is a list of nodes, where each node in G
          appears exactly once
    - partitions (optional) is an iterable of node lists, where each node
          in G appears in exactly one node list; the lists must follow the
          same order as the nodes in node_order
    - colors (optional) is an iterable of matplotlib colors indicating what
          color each partition outline should be
    - cmap is the matplotlib colormap used for the matrix image

    If partitions is specified, the same number of colors needs to be
    specified; an AssertionError is raised otherwise.
    """
    # None replaces the mutable [] defaults; list() additionally accepts
    # dict views and generators, which have no stable len()/re-iteration.
    partitions = [] if partitions is None else list(partitions)
    colors = [] if colors is None else list(colors)
    # Validate before creating the figure so a bad call leaves no empty plot.
    assert len(partitions) == len(colors)

    # to_numpy_matrix was removed in networkx 3.0; use to_numpy_array when the
    # installed version has it and fall back to the old name otherwise.
    to_matrix = getattr(nx, "to_numpy_array", None) or nx.to_numpy_matrix
    # Builtin bool instead of the removed np.bool alias: any non-zero edge
    # weight becomes True.
    adjacency_matrix = to_matrix(G, dtype=bool, nodelist=node_order)

    # Plot adjacency matrix in toned-down black and white
    pyplot.figure(figsize=(5, 5))  # in inches
    pyplot.imshow(adjacency_matrix,
                  cmap=cmap,
                  interpolation="none")

    # The rest is just if you have sorted nodes by a partition and want to
    # highlight the module boundaries
    ax = pyplot.gca()
    current_idx = 0
    for partition, color in zip(partitions, colors):
        size = len(partition)
        # imshow centres cell i on coordinate i, so the block covering cells
        # current_idx .. current_idx + size - 1 starts half a cell earlier.
        ax.add_patch(patches.Rectangle((current_idx - 0.5, current_idx - 0.5),
                                       size,  # Width
                                       size,  # Height
                                       facecolor="none",
                                       edgecolor=color,
                                       linewidth=1))
        current_idx += size
In [10]:
# order nodes by number of messages sent by each participant
# (Series.order() has been removed from pandas; sort_values() is its replacement)
o = list(act[fc].sum().sort_values(ascending=False).index)
draw_adjacency_matrix(G, node_order=o)
In [11]:
from sklearn import cluster

n_clusters = 5
# Spectral clustering involves random initialisation; fixing random_state
# makes the cluster labels reproducible across kernel restarts.
sc = cluster.SpectralClustering(n_clusters=n_clusters, random_state=0)
# One integer cluster label per row of pc, i.e. per entry of fc.
partition = sc.fit_predict(pc)
# Entries of fc reordered so that members of the same cluster are adjacent,
# in ascending order of cluster label.
partition_order = [fc[x] for x in np.argsort(partition)]
In [12]:
np.sort(partition)
Out[12]:
In [13]:
from collections import defaultdict

# Group the entries of fc by cluster label.
# Positions are visited in ascending label order (stable sort), so clusters
# are inserted into the dict in that same order. This keeps parts.values()
# and colors aligned with partition_order, which is also sorted by label,
# on interpreters where dicts iterate in insertion order rather than only
# where small-int keys happen to iterate in key order.
parts = defaultdict(list)
for i in np.argsort(partition, kind="mergesort"):
    parts[partition[i]].append(fc[i])
# One colour per cluster, in the same order as parts.values().
colors = [plt.cm.hsv(1. / (k + 1)) for k in parts.keys()]
In [14]:
colors
Out[14]:
In [15]:
len(parts.values())
Out[15]:
In [16]:
# Redraw the matrix with nodes grouped by cluster and one outline per cluster.
draw_adjacency_matrix(G, node_order=partition_order, partitions=parts.values(), colors=colors)
In [17]:
# Print "<cluster label>: <number of members>" for every cluster.
# Calling print with a single parenthesised argument produces the same
# output on Python 2 and is valid syntax on Python 3.
for k, v in parts.items():
    print(str(k) + ": " + str(len(v)))
In [18]:
# Map every member of every cluster to its cluster label as a plain int.
# A comprehension keeps its loop variables local, so the notebook-level
# n (= len(fc), set in the correlation cell) is no longer overwritten.
node_parts = {
    member: int(label)
    for label, members in parts.items()
    for member in members
}
In [19]:
import networkx as nx

# Keep only the positive correlations: pc * (pc > 0) zeroes every
# non-positive entry, and a zero entry yields no edge.
# nx.Graph(array) is used (as for G earlier in the notebook) because
# nx.from_numpy_matrix was removed in networkx 3.0.
#FG = nx.Graph(pc > .01)
FG = nx.Graph(pc * (pc > 0))
labels = dict(enumerate(fc))
FG = nx.relabel_nodes(FG, labels)
# Keyword arguments on purpose: the positional order of name/values was
# swapped between networkx 1.x and 2.x, keywords work with both.
nx.set_node_attributes(FG, name='corr-cluster', values=node_parts)
pos = nx.spring_layout(FG, k=0.6, iterations=1000)
nx.draw(FG, pos)
# Export the graph, including the 'corr-cluster' node attribute, as GEXF.
nx.write_gexf(FG, 'corr.gexf')
In [21]:
from bigbang import plot

# NOTE(review): presumably draws the activity stacked by cluster, with
# smooth=7 as a smoothing window -- confirm against bigbang.plot.stack.
plot.stack(
    act,
    partition=parts.values(),
    smooth=7,
)